###########################################
#  Primary Divisions: How Voters Evaluate Policy and Group Differences in Intra-Party Contests
#   - Forthcoming at The Journal of Politics
#   - Henderson et al 2021
#
###########################################
#  - code by S. Goggin & J. Henderson
########################################################
# This file produces voter sophistication leves/indices using knowledge and atttitude questions
########################################################
# inputs :: /data/data_matrix_scored.Rdata;
#				 :: /data/candidate_matrix.csv

# outputs ::
#        :: sophistication_indices.Rdata

#dirs="~/Dropbox/replication0/"
#dirs should be set here or in runR.R

rm(list=ls()[which(ls()!='dirs')])

library(ggplot2)
library(stringr)

# messy function to reorder by some estimate value
lableOrder=function(xmat,labels,label.groups,omits,o.column){

	# denote which label is to be omitted on the label
	for(i in 1:length(omits)){
		labels[which(labels==omits[i])]=paste('omit',labels[which(labels==omits[i])],sep='_')
	}

	# break groups into levels
	un_group=unique(label.groups)

	# this is the item to sort on, typically global or independent
	xm=xmat[,o.column]

	# vector which will contain row order
	xo=1:length(xm)

	# rearranging roworder within level
	for(j in 1:length(un_group)){
		ix=which(label.groups==un_group[j])
		if(length(ix)>2){
			ix=ix[!grepl(labels[ix],pattern='omit')]
			xo[ix]=xo[ix][order(xm[ix])]
		}
	}
	return(xmat[xo,])
}

reOrder=function(x,o){
	ix=array(NA,nrow(x))
	for(i in 1:length(o)){
		ix[i]=which(x$iv_order==o[i])
	}
	return(x[ix,])
}

setwd(dirs)

load('data/data_matrix_scored.Rdata')
scored=read.csv('data/candidate_matrix.csv')

###########################################
###########################################
# ATTENTION SCREENERS :: HELP ASSESS INATTENATIVENESS => CROSS-PRESSURE OR MIXED UP
###########################################
###########################################
# 1. differenes in attention for cross-pressure on 50-50
# 2. differenes in attention for cross-pressure on 25-75
# 3. differenes in attention for 'cross-pressure' on each issue, i.e.,
#    - a) out-party attitude on each issue
#    - b) quantile of each summary score
# 4. differences in attention for 'cross-pressure' on issues overall, i.e.,
#    - a) add up # of issues at 0 or out-party side
# 5. differenes in attention for cross-pressure on 25-25 split, most/least consistent
library(car)

###########################################
###Then, produce models w/ Clustered SEs

#Function from: http://scholar.byu.edu/jgubler/book/clustered-standard-errors-r
#Need this for clustered standard errors
clse.f <- function(dat,fm, cluster){
 require(sandwich)
 require(lmtest)
 not <- attr(fm$model,"na.action")
if( ! is.null(not)){
  cluster <- cluster[-not]
    dat <- dat[-not,]
}
 with(dat,{
 M <- length(unique(cluster))
 N <- length(cluster)
 K <- fm$rank
 dfc <- (M/(M-1))*((N-1)/(N-K))
 uj <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
 vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)
 coeftest(fm, vcovCL)
 }
 )
}

# somewhat clumsily merging prior objects, edundant now but replicating to avoid possible indexing issues
data_matrix=data_matrix[,-c(grep(names(data_matrix),pattern='scores'))]
candidate_matrix=cbind(data_matrix,scored[,c('scores','scores_var','scores_policy','scores_policy_var')])

# i_policy is conjoint position in either i1 or i2
# consistency here is candidate too consistent Dem or Rep position on policy
# analogous to above variance in aggregating over candidate profiles on issues

candidate_matrix$consistency=
as.numeric(candidate_matrix$"i_lgbt"==1)*-1+
as.numeric(candidate_matrix$"i_marriage"==1)*1+
as.numeric(candidate_matrix$"i_need"==1)*-1+
as.numeric(candidate_matrix$"i_govabuse"==1)*1+
as.numeric(candidate_matrix$"i_freetrade"==1)*-1+
as.numeric(candidate_matrix$"i_unfairtrade"==1)*1+
as.numeric(candidate_matrix$"i_righttochoose"==1)*-1+
as.numeric(candidate_matrix$"i_unbornlives"==1)*1+
as.numeric(candidate_matrix$"i_raisetaxes"==1)*-1+
as.numeric(candidate_matrix$"i_cuttaxes"==1)*1+
as.numeric(candidate_matrix$"i_co2emissions"==1)*-1+
as.numeric(candidate_matrix$"i_drilling"==1)*1+
as.numeric(candidate_matrix$"i_citizenship"==1)*-1+
as.numeric(candidate_matrix$"i_bordersecurity"==1)*1+
as.numeric(candidate_matrix$"i_guncontrol"==1)*-1+
as.numeric(candidate_matrix$"i_gunrights"==1)*1+
as.numeric(candidate_matrix$"i_reducemilitary"==1)*-1+
as.numeric(candidate_matrix$"i_strengthenmilitary"==1)*1+
as.numeric(candidate_matrix$"i_policing"==1)*-1+
as.numeric(candidate_matrix$"i_criminals"==1)*1


candidate_matrix$libcon=rowMeans(cbind(candidate_matrix[,grep(names(candidate_matrix),pattern='libcon')],candidate_matrix$self_place/3))

############################################################
############################################################
### Build sophistication indices
############################################################
############################################################

voter_id=unique(candidate_matrix$respondent)
voter_matrix=as.data.frame(matrix(NA,length(voter_id),ncol(candidate_matrix)))
for(i in 1:length(voter_id)){
  ix=which(candidate_matrix$respondent==voter_id[i])[1]
  voter_matrix[i,]=candidate_matrix[ix,]
}
names(voter_matrix)=names(candidate_matrix)
# identify rank order of which components correlate with summary w/n each dimension
# which items are more consistently answered

# inference-type approach; pick strongest interactions among items in support * policy-specific conjoint items
#c("envs_policy1","envs_policy2","envs_policy3","envs_policy4",
#"crme_policy1","crme_policy2","crme_policy3","crme_policy4",
#"abrt_policy1","abrt_policy2","abrt_policy3","abrt_policy4","abrt_policy5","abrt_policy6",
#"gays_policy1","gays_policy1",
#"immi_policy1","immi_policy2","immi_policy3","immi_policy4","immi_policy5","immi_policy6","immi_policy7","immi_policy7",
#"roll_tpp_act","roll_trd_adj",
#"guns_policy1","guns_policy2","guns_policy3","guns_policy4",
#"dfns_policy2","roll_usa_fre","roll_iransct",
#"roll_educrfr","roll_infrast","roll_medicar","roll_rpl_aca","roll_minwage")

# frequency of mis-classifications across issues
id=which(voter_matrix$pid==-1)
ir=which(voter_matrix$pid==1)
xmu=c(as.numeric(voter_matrix$libcon_envs[id]>0)+
as.numeric(voter_matrix$libcon_crme[id]>0)+
as.numeric(voter_matrix$libcon_abrt[id]>0)+
as.numeric(voter_matrix$libcon_gays[id]>0)+
as.numeric(voter_matrix$libcon_immi[id]>0)+
as.numeric(voter_matrix$libcon_trad[id]>0)+
as.numeric(voter_matrix$libcon_guns[id]>0)+
as.numeric(voter_matrix$libcon_dfns[id]>0)+
as.numeric(voter_matrix$libcon_taxs[id]>0)+
as.numeric(voter_matrix$libcon_need[id]>0),
as.numeric(voter_matrix$libcon_envs[ir]<0)+
as.numeric(voter_matrix$libcon_crme[ir]<0)+
as.numeric(voter_matrix$libcon_abrt[ir]<0)+
as.numeric(voter_matrix$libcon_gays[ir]<0)+
as.numeric(voter_matrix$libcon_immi[ir]<0)+
as.numeric(voter_matrix$libcon_trad[ir]<0)+
as.numeric(voter_matrix$libcon_guns[ir]<0)+
as.numeric(voter_matrix$libcon_dfns[ir]<0)+
as.numeric(voter_matrix$libcon_taxs[ir]<0)+
as.numeric(voter_matrix$libcon_need[ir]<0))

ymu=c(as.numeric(voter_matrix$libcon_envs[id]<0)+
as.numeric(voter_matrix$libcon_crme[id]<0)+
as.numeric(voter_matrix$libcon_abrt[id]<0)+
as.numeric(voter_matrix$libcon_gays[id]<0)+
as.numeric(voter_matrix$libcon_immi[id]<0)+
as.numeric(voter_matrix$libcon_trad[id]<0)+
as.numeric(voter_matrix$libcon_guns[id]<0)+
as.numeric(voter_matrix$libcon_dfns[id]<0)+
as.numeric(voter_matrix$libcon_taxs[id]<0)+
as.numeric(voter_matrix$libcon_need[id]<0),
as.numeric(voter_matrix$libcon_envs[ir]>0)+
as.numeric(voter_matrix$libcon_crme[ir]>0)+
as.numeric(voter_matrix$libcon_abrt[ir]>0)+
as.numeric(voter_matrix$libcon_gays[ir]>0)+
as.numeric(voter_matrix$libcon_immi[ir]>0)+
as.numeric(voter_matrix$libcon_trad[ir]>0)+
as.numeric(voter_matrix$libcon_guns[ir]>0)+
as.numeric(voter_matrix$libcon_dfns[ir]>0)+
as.numeric(voter_matrix$libcon_taxs[ir]>0)+
as.numeric(voter_matrix$libcon_need[ir]>0))

names(xmu)=c(id,ir)
names(ymu)=c(id,ir)

# kinder and kalmoe + identify the 20-30% who score highest on ideological knowledge/consistency
k_gov=voter_matrix$know_gov
k_sen1=voter_matrix$know_senate1
k_sen2=voter_matrix$know_senate2

k_gov[which(voter_matrix$know_gov=='Republican')]=1
k_gov[grep(voter_matrix$know_gov,pattern='Independent')]=0
k_gov[which(voter_matrix$know_gov=='Democrat')]=-1
k_gov[!grepl(k_gov,pattern='[0,1]')]=-2

k_sen1[which(voter_matrix$know_senate1=='Republican')]=1
k_sen1[grep(voter_matrix$know_senate1,pattern='Independent')]=0
k_sen1[which(voter_matrix$know_senate1=='Democrat')]=-1
k_sen1[!grepl(k_sen1,pattern='[0,1]')]=-2

k_sen2[which(voter_matrix$know_senate2=='Republican')]=1
k_sen2[grep(voter_matrix$know_senate2,pattern='Independent')]=0
k_sen2[which(voter_matrix$know_senate2=='Democrat')]=-1
k_sen2[!grepl(k_sen2,pattern='[0,1]')]=-2

k_gov=as.numeric(k_gov)
k_sen1=as.numeric(k_sen1)
k_sen2=as.numeric(k_sen2)

sen2=sign(tapply(k_sen2[which(k_sen2> -2)],voter_matrix$state[which(k_sen2> -2)],mean,na.rm=T)) # sanders and angus king
sen2[which(names(sen2)=='Wyoming')]=1
sen2[which(names(sen2)=='Vermont')]=0
sen2[which(names(sen2)=='Maine')]=0
sen1=sign(tapply(k_sen1[which(k_sen1> -2)],voter_matrix$state[which(k_sen1> -2)],mean,na.rm=T))
gov=sign(tapply(k_gov[which(k_gov> -2)],voter_matrix$state[which(k_gov> -2)],mean,na.rm=T))

know_gov=know_senate2=know_senate1=array('Democrat',nrow(voter_matrix))
un_state=unique(voter_matrix$state)
for(j in 1:length(un_state)){
	if(un_state[j]!="District of Columbia"){

		ix=which(voter_matrix$state==un_state[j])

		if(gov[which(names(gov)==un_state[j])]==1){
			know_gov[ix]='Republican'
		}
		if(gov[which(names(gov)==un_state[j])]==0){
			know_gov[ix]='Other Party / Independent'
		}

		if(sen1[which(names(sen1)==un_state[j])]==1){
			know_senate1[ix]='Republican'
		}
		if(sen1[which(names(sen1)==un_state[j])]==0){
			know_senate1[ix]='Other Party / Independent'
		}

		if(sen2[which(names(sen2)==un_state[j])]==1){
			know_senate2[ix]='Republican'
		}
		if(sen2[which(names(sen2)==un_state[j])]==0){
			know_senate2[ix]='Other Party / Independent'
		}

	}
}


# knowledge battery
X=cbind(as.numeric(voter_matrix$know_senate1==know_senate1), 	# know party of sen1
as.numeric(voter_matrix$know_senate2==know_senate2),					# know party of sen2
as.numeric(voter_matrix$know_gov==know_gov),									# know party of gov
as.numeric(voter_matrix$us_house_majority=='Republicans'),		# know house majority
as.numeric(voter_matrix$us_senate_majority=='Republicans'),		# know senate majority
as.numeric(voter_matrix$self_place_attempt==1),								# attempt self place
as.numeric(voter_matrix$dem_place<voter_matrix$rep_place))		# place dems left of reps

colnames(X)=c('know_sen1','know_sen2','know_gov','know_house_maj','know_sen_maj','know_any_self_place','know_D_left_R')

#table iv: summary cces survey items to measure party knowledge
#library(xtable)
#print(xtable(as.matrix(colMeans(X,na.rm=T)[c(4,5,3,1,2,7,6)]),digits=3),file='~/Dropbox/IdeologyPrimaryConjoints/merged/replication0/appendix/figures/knowledge_summary.tex')
#Xl=cbind(
	#as.numeric(voter_matrix$know_senate1==know_senate1), 				# know party of sen1
	#as.numeric(voter_matrix$know_senate2==know_senate2),					# know party of sen2
	#as.numeric(voter_matrix$know_gov==know_gov),									# know party of gov
#as.numeric(voter_matrix$us_house_majority=='Republicans'),		# know house majority
#as.numeric(voter_matrix$us_senate_majority=='Republicans'),		# know senate majority
#as.numeric(voter_matrix$self_place_attempt==1),								# attempt self place
#as.numeric(voter_matrix$dem_place<voter_matrix$rep_place))		# place dems left of reps
#plot(table(rowSums(Xl)))
#mean(rowSums(Xl)/4)

####################################
# Scaling voters on attitudes and knowledge -- used to (a) assess consistency,
#  and (b) produce precise threshold breaks for H,M,L levels
# SCALING IRT KNOWLEDGE; account for harder/easier items
library(MCMCpack)

if(!file.exists('data/posterior2.Rdata')){
	posterior2 <- MCMCirt1d(X,
		theta.constraints=list(K="+", N="-"),
		B0.alpha=.2, B0.beta=.2,
		burnin=500, mcmc=5000, thin=20, verbose=500,
		store.item=TRUE,seed=1005
	)
	save(posterior2,file='data/posterior2.Rdata')
} else {
	load('data/posterior2.Rdata')
}

#geweke.diag(posterior2)

sumsK=summary(posterior2)
thetaK=-sumsK$statistics[,1][grep(names(sumsK$statistics[,1]),pattern='theta')]
#plot(rowSums(X),thetaK)
cor.test(rowSums(X),thetaK)

# policy battery
Y=voter_matrix[,c("envs_policy1","envs_policy2","envs_policy3","envs_policy4",
"crme_policy1","crme_policy2","crme_policy3","crme_policy4",
"abrt_policy1","abrt_policy2","abrt_policy3","abrt_policy4","abrt_policy5","abrt_policy6",
"gays_policy1",
"immi_policy1","immi_policy2","immi_policy3","immi_policy4","immi_policy5","immi_policy6","immi_policy7","immi_policy8",
"roll_tpp_act","roll_trd_adj",
"guns_policy1","guns_policy2","guns_policy3","guns_policy4",
"dfns_policy2","roll_usa_fre","roll_iransct",
"roll_educrfr","roll_infrast","roll_medicar","roll_rpl_aca","roll_minwage")]

#rownames(voter_matrix)[which(sign(voter_matrix$libcon)==-1)[1]]='L'
#rownames(voter_matrix)[which(sign(voter_matrix$libcon)==1)[1]]='R'
#rownames(voter_matrix)[which(rowSums(X)==0)[20]]='N'
#rownames(voter_matrix)[which(rowSums(X)==7)[20]]='K'

if(!file.exists('data/posterior1.Rdata')){
	posterior1 <- MCMCirt1d(Y,
		theta.constraints=list(R="+", L="-"),
		B0.alpha=.2, B0.beta=.2,
		burnin=500, mcmc=5000, thin=20, verbose=500,
		store.item=TRUE,seed=1005
	)
	save(posterior1,file='data/posterior1.Rdata')
} else {
	load('data/posterior1.Rdata')
}
#geweke.diag(posterior1)
#plot(posterior1)
sums1=summary(posterior1)
theta1=-sums1$statistics[,1][grep(names(sums1$statistics[,1]),pattern='theta')]
#plot(voter_matrix$libcon,theta1)
#cor.test(voter_matrix$libcon,theta1)

thetaZ=theta1

########################################################
########################################################
########################################################
# Descriptive Analyses and Figures Here
########################################################
########################################################
########################################################

#################################
#### plot density/hist of X correct + frequencies
# amoung partisans
ii=c(id,ir)
Xo=X[ii,]
summary(Xo)


################################################################################
################################################################################
# inter item correlations and reliability benchmarks
################################################################################
################################################################################



################################################################################
################################################################################
# below is getting to analysis =>
#		step 1: classify empirical proportion of (un)sophisticates from our measures
#   step 2: produce stratifications on our scaled attitudes, natural knowledge scales
#   	=> X, scaled(Y), X by scaled(Y)
################################################################################
################################################################################

# K&K as useful guide

#########################################
#1. Use the self-placement scale
mean(abs(voter_matrix$self_place[c(id,ir)])==3)
#[1] 0.2414475
mean(abs(voter_matrix$self_place[c(id,ir)])>=2)
#[1] 0.5501285
mean(abs(voter_matrix$self_place[c(id,ir)])>=1)
#[1] 0.7741744

#########################################
#2. Knowledge
table(rowSums(Xo))/sum(table(rowSums(Xo)))
#0           1           2           3           4           5           6           7
#0.009689539 0.048447696 0.062487641 0.082855448 0.099861578 0.126952739 0.169863555 0.399841803
# 57% 6 or 7 correct; 40%  get all 7

#########################################
#3. Policy (in)consistency on 10 issue areas
table(xmu)/sum(table(xmu))
table(ymu)/sum(table(ymu))

c(mean(ymu==10),mean(ymu>=9),mean(ymu>=8),mean(ymu>=7),
mean(ymu>=6),mean(ymu>=5),mean(ymu>=4),mean(ymu>=3),
mean(ymu>=2),mean(ymu>=1))

c(mean(xmu==0),mean(xmu<=1),mean(xmu<=2),mean(xmu<=3),
mean(xmu<=4),mean(xmu<=5),mean(xmu<=6),mean(xmu<=7),
mean(xmu<=8),mean(xmu<=9))

#########################################
#4. Stability; aggregated estimates using pre/post

#########################################
#5. Interaction is take top scorers or knowledge and consistency
#c(id,ir)[which(xmu<=0)] #15%
#c(id,ir)[which(rowSums(Xo)>=7)] #40%
#c(id,ir)[which(xmu<=0&rowSums(Xo)>=7)] #9%

#c(id,ir)[which(xmu<=1)] #44%
#c(id,ir)[which(rowSums(Xo)>=6)] #57%
#c(id,ir)[which(xmu<=1 & rowSums(Xo)>=6)] #33%

# reasonable enough, though can also use scaling below, to get to 20 to 30% ....
rnk=(rank(thetaK[c(id,ir)])+rank(abs(thetaZ)[c(id,ir)]))/2
#quantile(rnk,prob=c(.7,.8))
#c(id,ir)[which(rnk>=3411.5)] #30% thresh
#c(id,ir)[which(rnk>=3788.4)] #20% thresh
#rnk2=(xmu/10+rowSums(Xo)/7)/2

#included1=array(0,length(rnk))
#included2=array(0,length(rnk))

H_included1 = rnk>=quantile(rnk,prob=c(.7,.8))[1]
H_included2 = xmu<=1 & rowSums(Xo)>=6

#mean(xmu<=1 & rowSums(Xo)>=6)
#[1] 0.3296421

# included sophisticateds
#c(id,ir)[H_included1]
#c(id,ir)[H_included2]

# excluded non-sophisticateds
#c(id,ir)[!H_included1]
#c(id,ir)[!H_included2]

#quantile(rnk,prob=c(.2,.3))
L_included1 = rnk<=quantile(rnk,prob=c(.2,.3))[2]
L_included2 = xmu>=2 & rowSums(Xo)<=5

mean(xmu>=2 & rowSums(Xo)<=5)
#[1] 0.3177773

# alternatively take the average of these; makes no real difference
#H_included2 = rnk2>quantile(rnk2,prob=c(.3,.7))[2]
#L_included2 = rnk2<=quantile(rnk2,prob=c(.3,.7))[1]
# excluded least-sophisticateds
#c(id,ir)[L_included1]
#c(id,ir)[L_included2]

M_included1 = !H_included1 & !L_included1
M_included2 = !H_included2 & !L_included2
mean(M_included2)
#[1] 0.3525806

#voter_matrix$respondent[c(id,ir)]
#voter_matrix$pid3[c(id,ir)]

indices=list(
	'resp_id'=voter_matrix$respondent,
	'num_inconsistent'=xmu,
	'num_consistent'=ymu,
	'know7'=sort(c(id,ir)[which(rowSums(Xo)>=7)]),
	'know67'=sort(c(id,ir)[which(rowSums(Xo)>=6)]),
	'know0to5'=sort(c(id,ir)[which(rowSums(Xo)<=5)]),
	'know0to2'=sort(c(id,ir)[which(rowSums(Xo)<=2)]),
	'incon01'=sort(c(id,ir)[which(xmu<=1)]),
	'incon2to10'=sort(c(id,ir)[which(xmu>=2)]),
	'incon4to10'=sort(c(id,ir)[which(xmu>=4)]),
	'H_ix1'=sort(c(id,ir)[H_included1]),'H_ix2'=sort(c(id,ir)[H_included2]),
	'M_ix1'=sort(c(id,ir)[M_included1]),'M_ix2'=sort(c(id,ir)[M_included2]),
	'L_ix1'=sort(c(id,ir)[L_included1]),'L_ix2'=sort(c(id,ir)[L_included2])
)

save(indices,
	file='data/sophistication_indices.Rdata'
)

#END

#END 4_build_sophistication_indices

# trying to keep everything together in the data
# -- cces_stacked in data/cces_stacked_unmatched.Rdata
# -- voter_matrix in data/voter_matrix.csv
# -- candidate_matrix in data/candidate_matrix.csv (revised)
# -- core_for_ggplot in data/core_for_ggplot_rep_global.csv
# -- core_for_ggplot in data/core_for_ggplot_dem_global.csv (for dems)
# -- core_for_ggplot in data/core_for_ggplot_rep_global.csv (for reps)
# -- all_score,rep_score,dem_score, in data/scoreMats.Rdata
# -- data_matrix (just combining revised candidate_matrix & voter_matrix) in data_matrix_scored.Rdata

# -- indices in indices sophistication_indices.Rdata
# -- irt models to measure latent knowledge and attitudes in posterior1.Rdata and posterior2.Rdata
